---
title: "Analysis code for Designing Passwords for Web Survey Access: The Effects of Password Length and Complexity on Survey and Panel Recruitment"
author: "Georg-Christoph Haas, Marieke Volkert, Stefan Zins"
output: 
  html_document:
    toc: true
    toc_float: true
---

# Abstract
Online probability panels that recruit participants via postal invitation letters use passwords to manage access to the survey. While previous research has examined primarily whether providing a password affects response rates, less attention has been given to the impact of password strength, defined by length and complexity, on response propensities. Password length refers to the number of characters in a password, while complexity refers to the set of characters (e.g., lowercase letters, digits). This study evaluates the influence of password length and complexity on various participation levels (i.e., survey access, response rates, and panel registration) as well as the propensity to consent to data linkage and item response rates for income questions. We conducted an experiment in the first wave of a German online probability survey and manipulated password length and complexity. Additionally, we included a group using the default length and complexity settings (eight uppercase letters) of the survey hosting service. The participants were randomly assigned to one of these groups. The findings indicate that longer and more complex passwords increase both participation rates and the propensity to consent to data linkage between survey and administrative data.



```{r setup, include=FALSE}

#save.image(paste0(DATA, "geofence.RData"))
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

#***************************************************
# load packages
#***************************************************


library(here)
library(dplyr)
library(tidyr)
library(readstata13)
library(scales)
library(ggplot2)
library(knitr)
library(readxl)
library(margins)
library(kableExtra)
library(lubridate)
library(stargazer)
library(ggtext)
library(survey)
library(magrittr)
library(margins)

#***************************************************
# load user-build functions
#***************************************************

# Saves a plot as "<x>.png" (15 cm x 19 cm) in the project's 03_IMG folder.
# No plot is passed on, so ggsave() falls back to its default plot.
#   x: file name without extension.
save_plot <- function(x) {
  target_file <- paste0(here(), "/03_IMG/", x, ".png")
  ggsave(filename = target_file, width = 15, height = 19, units = "cm")
}

## some code snippets I need a lot put in functions
scramble_pw <- function(DATA) {
  # Scrambles the passwords shown in the summary tables: the characters of
  # every entry in the `password` column are put into a random order, one row
  # at a time. The returned data frame is still rowwise.

  shuffle_chars <- function(pw) {
    chars <- strsplit(pw, "")[[1]]
    paste(sample(chars), collapse = "")
  }

  DATA %>%
    rowwise() %>%
    mutate(password = shuffle_chars(password))

}

MY_KABLE <- function(DATA, COL = NA) {

  # Default design for the summary tables: a striped, full-width kable.
  #   DATA: table to print
  #   COL:  handed to kable() as `col.names` (default NA)

  tab <- kable(DATA, col.names = COL)
  kable_styling(tab, bootstrap_options = "striped", full_width = TRUE)

}

# Bookkeeping for the statistics quoted in the text, so that they can be saved
# and printed in the same order as in the paper.
#---------------------------------------------------------------
#   stats_paper - list with all saved statistics, in the order they appear
#   statx       - running number of the statistic saved last
stats_paper <- list()
statx <- 0

## Appends OBJECT to `stats_paper` and advances the counter `statx`.
## Both objects live outside the function and are updated via `<<-`.
save_stat <- function(OBJECT) {

  slot <- statx + 1
  statx <<- slot
  stats_paper[[slot]] <<- OBJECT

}


##print like this
write_stat <- function(ADJUST = 0) {

  # Returns a statistic stored in `stats_paper`, counted back from the one
  # saved last: ADJUST = 0 gives the latest entry, ADJUST = 1 the one before,
  # and so on. Pass a value for ADJUST when a code chunk saved more than one
  # statistic.

  position <- max(statx) - ADJUST
  stats_paper[[position]]

}


chi <- function(DATA, VAR) {

  # Chi-squared test on the cross-table of DATA[["grp"]] by DATA[[VAR]].
  # Returns one formatted string:
  #   "$X^2$ (<df>, N = <rows of DATA>) = <statistic>, p = <p-value>"
  # The statistic is rounded to one decimal; the p-value is printed with three
  # decimals and without a leading zero.

  crosstab <- table(DATA[["grp"]], DATA[[VAR]])
  test <- chisq.test(crosstab)

  stat_txt <- round(test$statistic, 1)
  p_txt <- sub("^0", "", sprintf("%.3f", test$p.value))

  paste0("$X^2$ (", test$parameter, ", N = ", nrow(DATA), ") = ", stat_txt, ", p = ", p_txt)

}



```

```{r load and edit data}

# Spezialdatensatz_exp01-02_v2.csv 
# -- received: 29.02.2024
# -- codebook in Codebook_v2.html (in German)
# -- contains data for access rate, response rate, registration and data linkage consent rate analysis.
# 
# ----------------------------------------------------------------------------
# 
# opalid_passwords.xlsx
# -- contains match for opalid and passwords
# -- needed to join different datasets 
# -- made by Stefan Schwarz.
# 
# ----------------------------------------------------------------------------
# sample_HF_KOH1_pw_trach_.csv
# 
# -- assignment of experimental groups to sample cases
# -- experimental group (grp) -- 0-5
# -- sample (sample) -- ieb/lhg
# -- used password (pw) -- string 
# 
# -----------------------------------------------------------------------------
# 
# IAB-OPAL_Surveydaten_Core-Hauptfeld_Welle-1_v1.dta
# 
# -- contains the survey data
# -- is needed for item nonresponse analysis for the income question: 
# 	-- aa1ze450 - gross income
# 	-- aa1ze500 - net income
# 	-- joined with other datasets with opalid


# assignment of the experimental groups to the sample cases (see file notes above)
pw <- read.csv2(paste0(here(), "/05_DATA/sample_HF_KOH1_pw_trach_.csv") )

# opalid-to-password match, extended by the group assignment via the password
opal_id <- readxl::read_xlsx(paste0(here(), "/05_DATA/opalid_passwords.xlsx")) %>%
  left_join(pw, by = c("password" = "pw"))

# participation data: drop password_grp and derive
#   day     - date parsed (day-month-year) from the last 10 characters of START
#   day_min - earliest non-missing day in the data
#   day0    - difference between day and day_min
#   lc      - 1 if aa1za010 equals the consent answer, 0 for any other answer
df <- read.csv(paste0(here(), "/08_ORIG/Spezialdatensatz_exp01-02_v2.csv")) %>%
  dplyr::select(-password_grp) %>%
  mutate(day = dmy(substr(START, nchar(START)- 9, nchar(START))),
         day_min = min(day, na.rm=TRUE),
         day0 = day-day_min, # day0 = day in the field.
         lc = ifelse(aa1za010 =="Ja, ich bin einverstanden", 1, 0) # linkage consent
         )


# one row per entry of opal_id, with outcome indicators and design factors:
#   teil      - 1 if STATUS is not missing, else 0
#   STATUS    - missing values set to 0
#   teil_voll - 1 if UN contains "Voll", else 0
#   reg       - 1 if UN equals the full "registriert" status string, else 0
#   simple    - "simple" for grp 0, 1, 3; "complex" for all other groups
#   length    - "8 chars" for grp 0, "5 chars" for grp 1 and 2, else "11 chars"
#   ausbild   - missing values become their own category 6
#   druckerei - "system" if brtranche_ba is 1 or 2, else "haus"
# NOTE(review): unlike teil and teil_voll, reg is NA wherever UN is NA --
# confirm that UN is never missing, or sum(reg) and table(grp, reg) are affected.
response <- left_join(opal_id, df, by = "opalid") %>%
  mutate(teil = ifelse(is.na(STATUS), 0, 1),
         STATUS = ifelse(is.na(STATUS), 0, STATUS),
         teil_voll = case_when(grepl("Voll", UN) ~ 1),
         teil_voll = ifelse(is.na(teil_voll), 0,teil_voll),
         reg = ifelse(UN == "Vollbefragung, panelconsent, registrierungsbereit, registriert", 1,0), 
         simple = factor(ifelse(grp == 0 | grp == 1 | grp == 3, "simple", "complex"),
                         levels = c("simple", "complex")),
         length = ifelse(grp == 0, "8 chars",
                         ifelse(grp == 1 | grp == 2, "5 chars", "11 chars")),
         length = factor(length, levels = c("8 chars", "5 chars", "11 chars")),
         ausbild = ifelse(is.na(ausbild), 6, ausbild),#categorize missings
         druckerei = ifelse(brtranche_ba %in% c(1,2), "system","haus"))  
        
  
#analysis datasets for employee sample(ieb) and welfare recipient sample (lhg)
response_ieb <- filter(response, sample == "ieb")
response_lhg <- filter(response, sample == "lhg")

# dataset for income analysis
# (Stata file read without converting labelled values to factors; opalid is
# created as a numeric copy of p_opalid)
w1 <- read.dta13(paste0(here(), "/08_ORIG/IAB-OPAL_Surveydaten_Core-Hauptfeld_Welle-1_v1.dta"), convert.factors = FALSE)
w1$opalid <- as.numeric(w1$p_opalid)

```

```{r author build functions}

#this part builds functions for the analysis. Mostly to get the format right and avoid copy-pasting table together from different sources.

# Prepares the data for one outcome (survey access, completion, panel
# registration or linkage consent) as cumulative rates per experimental group.
#   DATA:     analysis data with the columns grp, day0 and the outcome
#   RATE_VAR: name of the outcome column, given as a string
# Returns one row per group and day0 with the cumulative count (cumsum), the
# group size (n), the rate (prop; rr in percent), its standard error (se, in
# percent) and the limits of a 95% confidence interval (do, up, in percent).
# The result is a rowwise data frame.
rate_edit <- function(DATA, RATE_VAR){

  # limits of the 95% confidence interval of a proportion, in percent
  ci_pct <- function(x, n) {
    prop.test(x, n, conf.level = 0.95)$conf.int * 100
  }

  # group sizes are counted over all rows of DATA, also those without day0
  grp_size <- DATA %>%
    group_by(grp) %>%
    summarize(n = n())

  # one password per group, always taken from the global response_ieb
  grp_password <- distinct(response_ieb, grp, .keep_all = TRUE) %>%
    dplyr::select(grp, password)

  # outcome counts per group and day
  daily <- DATA %>%
    filter(!is.na(day0)) %>%
    group_by(grp, day0) %>%
    summarize(count = sum(.data[[RATE_VAR]]))

  daily %>%
    group_by(grp) %>%
    mutate(cumsum = cumsum(count)) %>%
    left_join(grp_size, by = "grp") %>%
    mutate(prop = cumsum / n,
           rr = prop * 100,
           se = sqrt(prop * (1 - prop) / n) * 100) %>%
    left_join(grp_password, by = "grp") %>%
    rowwise() %>%
    mutate(up = ci_pct(cumsum, n)[2],
           do = ci_pct(cumsum, n)[1])
}

p_star <- function(P) {

  # Significance stars for p-values:
  # "***" below 0.001, "**" below 0.01, "*" below 0.05, otherwise "".
  stars <- ifelse(P < 0.05, "*", "")
  stars <- ifelse(P < 0.01, "**", stars)
  ifelse(P < 0.001, "***", stars)

}

# Provides descriptive statistics for the rate of interest, using each
# group's last day0.
#   DATA:  output of rate_edit() (needs grp, day0, rr, prop, n, se, password)
#   KABLE: TRUE returns a styled kable, FALSE returns the data frame itself
# Per group the result holds the password, the group label with N, the rate
# and its standard error (percent), the difference to the control group
# (grp == 0) in percentage points with its standard error, and the p-value of
# a two-sample test of proportions against the control group.
# The five labels are attached by row position, so DATA must yield exactly one
# row per group, ordered control, short/simple, short/complex, long/simple,
# long/complex.
rate_des <- function(DATA, KABLE = TRUE) {

  lbl_grp <- c("control", "short, simple", "short, complex", "long, simple", "long, complex")

  stat <-
  DATA %>%
    group_by(grp) %>%
    mutate(daymax = max(day0)) %>%
    ungroup() %>%
    # keep only the last observed day of every group (final cumulative rate)
    filter(day0 == daymax) %>%
    cbind(lbl_grp) %>%
    mutate(diff = round(rr - .data$rr[.data$grp == 0], 1),
           # rate and size of the control group, repeated on every row
           ctrl_prop = .data$prop[.data$grp == 0],
           ctrl_n = .data$n[.data$grp == 0],
           se_diff = round(sqrt(prop * (1 - prop) / n +
                                  ctrl_prop * (1 - ctrl_prop) / ctrl_n) * 100, 1),
           rate_rr = paste0(round(rr, 1)),
           se_rr = round(se, 1),
           lbl_grp = paste0(lbl_grp, " (N=", n, ")")) %>%
    rowwise() %>%
    # group vs. control; counts are rebuilt from the proportions
    mutate(p = prop.test(c(prop * n, ctrl_prop * ctrl_n), c(n, ctrl_n))$p.value) %>%
    ungroup() %>%
    # three decimals, no leading zero
    mutate(p = sub("^0", "", sprintf("%.3f", p))) %>%
    dplyr::select(password, lbl_grp, rate_rr, se_rr, diff, se_diff, p)


  if (KABLE == TRUE) {
    stat %>%
      kable() %>%
      kable_styling(bootstrap_options = "striped",
                    full_width = TRUE)
  } else {
    stat
  }

}

rate_pn <- function(DATA) {

  # Cumulative count (x) and group size (n) per experimental group on each
  # group's last day0, in the layout needed for prop.test().
  #   DATA: output of rate_edit()
  # The five labels are attached by row position.
  group_labels <- c("control", "short, simple", "short, complex", "long, simple", "long, complex")

  last_day <- DATA %>%
    group_by(grp) %>%
    mutate(daymax = max(day0)) %>%
    ungroup() %>%
    filter(day0 == daymax)

  cbind(last_day, lbl_grp = group_labels) %>%
    dplyr::select(lbl_grp, grp, x = cumsum, n)
}

```

# Data

## Response rates

```{r}
# response rates in percent (one decimal): mean of teil_voll per sample,
# saved for the inline text below (employee sample first, then welfare recipients)
save_stat(round(mean(response_ieb$teil_voll)*100,1))
save_stat(round(mean(response_lhg$teil_voll)*100,1))
```

Response rate employee sample: `r write_stat(1)`\
Response rate welfare recipient sample: `r write_stat()`

## Panel Registration rates

```{r}
# panel registration rates in percent (one decimal): sum of reg divided by the
# sum of teil_voll per sample (employee sample first, then welfare recipients)
save_stat(round(sum(response_ieb$reg)/sum(response_ieb$teil_voll)*100,1))
save_stat(round(sum(response_lhg$reg)/sum(response_lhg$teil_voll)*100,1))
```

Panel registration and consent rate employee sample: `r write_stat(1)`\
Panel registration and consent welfare recipient sample: `r write_stat()`


## Table S1: sample characteristics by sample frame in percent.

```{r}

my_prop <- function(DATA, VAR) {
  # Percentages of the cross-table of DATA[[VAR]] by DATA[["erwerb"]]
  # (share of each cell in the whole table), formatted with one decimal.
  counts <- table(DATA[[VAR]], DATA[["erwerb"]])
  pct <- prop.table(counts) * 100
  sprintf("%.1f", pct)
}

# Table S1: three columns bound side by side -- row labels, percentages for the
# employee sample (ieb) and percentages for the welfare recipient sample (lhg).
# Every variable starts with a bold header row that gets an empty string in the
# two data columns. The labels are matched to the percentages by position only.
# NOTE(review): the age labels "49-59" and "59-64" overlap -- confirm the cut points.
sample_props <-
cbind(
#labels
  c("**Educational attainment**", "No degree", "Vocational training", "High School diploma", "High School diploma\nand vocational training", "Higher education degree", "No information",
                                              "**Age**", "18-28", "29-38", "39-48", "49-59", "59-64",
                                               "**Sex**", "Male", "Female",
                                       "**Citizenship**", "Other", "German"),
#ieb data
  c(
c("", my_prop(response_ieb, "ausbild")),
c("", my_prop(response_ieb, "agegr")),
c("", my_prop(response_ieb, "sex")),
c("", my_prop(response_ieb, "staat"))
),
#lhg data
  c(
c("", my_prop(response_lhg, "ausbild")),
c("", my_prop(response_lhg, "agegr")),
c("", my_prop(response_lhg, "sex")),
c("", my_prop(response_lhg, "staat"))
)) %>%
  as.data.frame()

# column headers carry the number of cases of each sample
names(sample_props) <- c("Variable", 
                              paste0("Employees (N = ",nrow(response_ieb), ")"),
                              paste0("Welfare recipients (N = ", nrow(response_lhg), ")")) 

sample_props %>%
  MY_KABLE()  


```


# Results

## Access, complete response and panel registration rates

### Table 2. Access, Complete Response, and Panel Registration Rates by Experimental Group in the Employee Sample


```{r}


# Table 2 (employee sample): the descriptive statistics for access (teil),
# complete response (teil_voll) and panel registration (reg) are joined side by
# side on password and group label. Columns of the first two tables get the
# suffixes .access / .complete; the registration columns keep their plain names.
# The passwords are scrambled before printing.
des_ieb <- 
left_join(
  rate_des(rate_edit(response_ieb, "teil"), KABLE = FALSE),
  rate_des(rate_edit(response_ieb, "teil_voll"), KABLE = FALSE), 
  by = c("password", "lbl_grp"),
  suffix = c(".access", ".complete")
  ) %>%
  left_join(
  rate_des(rate_edit(response_ieb, "reg"), KABLE = FALSE), 
  by = c("password", "lbl_grp"),
  suffix = c("", ".reg")) %>%
  scramble_pw() 

# print the table; each footnote reports the chi-squared test of the outcome
# across all experimental groups
des_ieb %>%
  MY_KABLE() %>%
  add_footnote(paste0("Survey Complete: ", chi(response_ieb, "teil_voll")), notation = "none", escape = FALSE) %>%
  add_footnote(paste0("Survey Access: ", chi(response_ieb, "teil")) ,notation = "none", escape = FALSE) %>%
  add_footnote(paste0("Panel Registration: ", chi(response_ieb, "reg")), notation = "none", escape = FALSE)
              
  

```

```{r warning=FALSE, results='asis'}
options(dplyr.summarise.inform = FALSE)

#einzelne Tests

# Function to get the correct format for the test of proportions.
#   DATA: analysis data set handed to rate_edit()
#   VAR:  name of the outcome column, given as a string
#   GRP:  values of grp to compare, e.g. c(0, 3)
# Returns a string "z=<statistic>, p=<p-value>" for the selected groups.
proptest <- function(DATA, VAR, GRP){

  # cumulative counts (x) and group sizes (n) of the selected groups
  X <- filter(rate_pn(rate_edit(DATA, VAR)), grp %in% GRP)

  Y <- prop.test(X$x, X$n)

  # the square root of the chi-squared statistic is reported as z
  prop <- paste0("z=", round(sqrt(Y$statistic),1), ", p=", sprintf("%.3f", Y$p.value))

  prop
}  

```


### In text statistics to describe Table 2

```{r}
# Four tests of proportions in the employee sample, saved in the order in which
# the text below prints them (write_stat(3) down to write_stat()).

#control (grp 0) vs. short/simple (grp 1): access
save_stat(proptest(response_ieb, "teil", c(0,1)))

#control (grp 0) vs. long/simple (grp 3)
#access
save_stat(proptest(response_ieb, "teil", c(0,3)))


#response rate
save_stat(proptest(response_ieb, "teil_voll", c(0,3)))


#panel registration rate
save_stat(proptest(response_ieb, "reg", c(0,3)))

```

**Test** control vs. short/simple: `r write_stat(3)`

**Test** control vs. long/simple:\
- access: `r write_stat(2)`\
- response rate: `r write_stat(1)`\
- panel registration rate:`r write_stat()`

### Table S3. Complete Response, and Panel Registration Rates by Experimental Group in the Employee Sample among those who accessed the survey or those who completed the survey {.tabset}

Does the increase in panel registration result from the password design itself, or are panel registration rates higher because of the increased response rate? 


```{r}

# Builds the conditional-rates table (Tables S3 / S4) for one sample.
#   DATA: analysis data set of one sample (e.g. response_ieb)
# Returns a styled kable with one column per experimental group and eight rows:
#   rows 1-5: among those who accessed the survey (teil == 1): N, completion
#             rate and its difference to control, registration rate and its
#             difference to control
#   rows 6-8: among those who completed the survey (teil_voll == 1): N,
#             registration rate and its difference to control
# The pieces are bound by position (cbind), so every piece has to hold the
# groups in the same order.
# NOTE(review): the row labels announce "(s.e.)" and "p-value", but only
# rate_rr and diff are selected from rate_des() -- confirm the labels.
fun_among <- function(DATA) {
 
  lbl = c("control", "short, simple", "short, complex", "long, simple", "long, complex")

#completion and registration rate among those who accessed
tab_among <- 
cbind(
  filter(DATA, teil ==1) %>%
    group_by(grp) %>%
    summarize(N_access = n()),
  rate_des(rate_edit(filter(DATA, teil ==1), "teil_voll"), KABLE = FALSE) %>%
    dplyr::select(completion = rate_rr, completion_diff = diff),
  rate_des(rate_edit(filter(DATA, teil ==1), "reg"), KABLE = FALSE) %>%
    dplyr::select(reg = rate_rr, reg_diff = diff)
) %>%
  dplyr::select(-grp) %>%
#registration rate among those who completed the survey
cbind(
  filter(DATA, teil_voll ==1) %>%
    group_by(grp) %>%
    summarize(N_comp = n()),
  rate_des(rate_edit(filter(DATA, teil_voll ==1), "reg"), KABLE = FALSE) %>%
    dplyr::select(reg_con_comp = rate_rr, reg_con_comp_diff = diff)) %>%
  dplyr::select(-grp) %>%
  # transpose: statistics become rows, experimental groups become columns
  t()

row.names(tab_among) <- c("N", "% (s.e.)", "Diff in %-points (s.e.) p-value", "% (s.e.)", "Diff in %-points (s.e.) p-value", "N", "% (s.e.)", "Diff in %-points (s.e.) p-value")

#final table 
# (row groups are labelled with the matching chi-squared test across groups)
tab_among %>%
  MY_KABLE(COL = c("", lbl))  %>%
  pack_rows("Among those who accessed the survey", 1,5) %>%
  pack_rows(paste0("Completion rate, ", chi(filter(DATA, teil == 1), "teil_voll")), 2,3) %>%
  pack_rows(paste0("Registration rate, ", chi(filter(DATA, teil == 1), "reg")), 4,5) %>%
  pack_rows("Among those who completed the survey", 6,8) %>%
  pack_rows(paste0("Registration rate, ", chi(filter(DATA, teil_voll == 1), "reg")), 7,8)
  
   
}


```

```{r}
#print Table S3 (employee sample)
fun_among(response_ieb)

```


### Chi-squared test

Testing whether the password design affects the response rate among those who accessed the survey.

```{r}

# chi-squared test of complete response by group among employees who accessed
# the survey (teil == 1); saved for the inline text below
chi(filter(response_ieb, teil == 1), "teil_voll") %>%
  save_stat()

```

`r write_stat()`

Testing whether the password design affects the panel registration rate among those who accessed the survey or completed the survey.

```{r}

## "survey accessed": registration by group among employees with teil == 1
save_stat(chi(filter(response_ieb, teil == 1), "reg"))


## "survey completed": registration by group among employees with teil_voll == 1
save_stat(chi(filter(response_ieb, teil_voll == 1), "reg"))


```

**Test** survey accessed `r write_stat(1)` **Test** survey completed `r write_stat()`

### Table 3. Access, Complete Response, and Panel Registration Rates by Experimental Group in the Welfare Recipient Sample

```{r}

# Table 3 (welfare recipient sample): the descriptive statistics for access
# (teil), complete response (teil_voll) and panel registration (reg) are joined
# side by side on password and group label, the passwords are scrambled and the
# table is printed. Each footnote reports the chi-squared test of the outcome
# across all experimental groups.
left_join(
  rate_des(rate_edit(response_lhg, "teil"), KABLE = FALSE),
  rate_des(rate_edit(response_lhg, "teil_voll"), KABLE = FALSE), 
  by = c("password", "lbl_grp"),
  suffix = c(".access", ".complete")
  ) %>%
  left_join(
  rate_des(rate_edit(response_lhg, "reg"), KABLE = FALSE), 
  by = c("password", "lbl_grp"),
  suffix =c("", ".reg")) %>%
  scramble_pw() %>%
  MY_KABLE() %>%
    add_footnote(paste0("Survey Complete: ", chi(response_lhg, "teil_voll")), notation = "none", escape = FALSE) %>%
  add_footnote(paste0("Survey Access: ", chi(response_lhg, "teil")) ,notation = "none", escape = FALSE) %>%
  add_footnote(paste0("Panel Registration: ", chi(response_lhg, "reg")), notation = "none", escape = FALSE)




```


### Table

Table S4: Complete Response, and Panel Registration Rates by Experimental Group in the Welfare Recipient Sample among those who accessed the survey or those who completed the survey

Does the increase in panel registration result from the password design itself, or are panel registration rates higher because of the increased response rate?


```{r}
#print Table S4 (welfare recipient sample)
fun_among(response_lhg)

```

### Chi-squared test

Testing whether the password design affects the response rate among those who accessed the survey.

```{r}
# chi-squared test of complete response by group among welfare recipients who
# accessed the survey (teil == 1); saved for the inline text below
chi(filter(response_lhg, teil == 1), "teil_voll") %>%
  save_stat()
```

`r write_stat()`

Testing if the password design affects the panel registration rate among those who accessed the survey or completed the survey.

```{r results = 'asis'}
### "survey accessed": registration by group among welfare recipients with teil == 1
save_stat(chi(filter(response_lhg, teil == 1), "reg"))

## "survey completed": registration by group among welfare recipients with teil_voll == 1
save_stat(chi(filter(response_lhg, teil_voll == 1), "reg"))

```

**Test** survey accessed `r write_stat(1)`  
**Test** survey completed `r write_stat()`  
**Test** panel registration conditional on survey access: control vs short/simple : `r write_stat()`

## Logistic Regression (Main Effects) {.tabset}


```{r}

#we need those variables for many models.

#independent variables
# (right-hand side of the model formulas, pasted behind "<outcome> ~ ";
# every predictor enters as a factor)
iv <- "factor(grp) + factor(erwerb) + factor(ausbild) + factor(sex)  + factor(staat) + factor(regd) + factor(agegr) + factor(Anbieter) + factor(druckerei)"

#margins vars
# (predictors for which average marginal effects are computed; regd, Anbieter
# and druckerei are in the models but not in this list)
margin_vars <- c("grp", "erwerb", "ausbild",  "sex", "staat", "agegr")

```

```{r eval=FALSE, include=FALSE}

#average marginal effects for access, response, reg
# Each step fits a logistic regression on the full data set `response`,
# summarises the average marginal effects of `margin_vars` and stores the
# summary as an .rds file. `md` is overwritten by every model.

#access
md <- glm(paste0("teil ~ ", iv), 
          data = response, 
          family = binomial)

ame_main_teil <- summary(margins(md, variables = margin_vars))
saveRDS(ame_main_teil, paste0(here(), "/05_DATA/ame_main_teil.rds"))

#response
md <- glm(paste0("teil_voll ~", iv),
          data = response, 
          family = binomial)

ame_main_teil_voll <- summary(margins(md, variables = margin_vars))
saveRDS(ame_main_teil_voll, paste0(here(), "/05_DATA/ame_main_teil_voll.rds"))


#registration
md <- glm(paste0("reg ~ ", iv),
          data = response, 
          family = binomial)

ame_main_reg <- summary(margins(md, variables= margin_vars))
saveRDS(ame_main_reg, paste0(here(), "/05_DATA/ame_main_reg.rds"))

## lets see if passwords have an effect on registration rates within respondents
# (same registration model, restricted to complete interviews: teil_voll == 1)
md <- glm(paste0("reg ~ ", iv),
          data = filter(response, teil_voll == 1),
          family = binomial)

ame_main_reg_voll <- summary(margins(md, variables = margin_vars))
# store the respondent-only effects, not the full-sample ame_main_reg,
# under the respondent-only file name
saveRDS(ame_main_reg_voll, paste0(here(), "/05_DATA/ame_main_reg_voll.rds"))


#combine everything into one dataset for the figure
# (the three full-sample summaries are stacked; `var` names the participation
# level; the respondent-only model ame_main_reg_voll is not included)
ame_main <- rbind( 
mutate(ame_main_teil, var = "Access"),
mutate(ame_main_teil_voll, var = "Complete interview"),
mutate(ame_main_reg, var = "Registrated as Panelist"))

saveRDS(ame_main, paste0(here(), "/05_DATA/ame_main.rds"))

# csv copy with significance stars and AME / SE rounded to three decimals
ame_main %>%
  mutate(stars = p_star(p),
         AME = round(AME, 3),
         SE = round(SE, 3)) %>%
  write.csv2(paste0(here(), "/05_DATA/ame_main.csv"))


```

```{r}
# load the stored average marginal effects of the main-effect models
ame_main <- readRDS(paste0(here(), "/05_DATA/ame_main.rds"))

# reference categories: added as extra rows with a `factor` value only, so
# that they can serve as bold header rows in the figure and the table
refs <- c("agegr1", "ausbild1", "erwerb1", "grp0", "sex0", "staat0")

ame_main <- full_join(ame_main, data.frame(factor = refs))

# old_var holds the distinct terms in the sort order of table(); new_var gives
# the display label for each of them BY POSITION, so both vectors must have the
# same length and order (22 labels are listed here).
old_var <- names(table(ame_main$factor))
new_var <- c("**Age (Ref: 18-28)**", "29-38", "39-48", "49-59", "59-64",
             "**Educational attainment**<br>**(Ref: No degree)**", "Vocational training", "High School diploma", "High School diploma<br>and vocational training", "Higher education degree", "No information",
             "**Employment Status (Ref: Employees)**", "Welfare Recipients",
             "**Password strength (Ref: Control)**", "Short, simple", "Short, complex", "Long, simple", "Long, complex",
             "**Sex (Ref: Male)**", "Female",
             "**Citizenship (Ref: Other)**", "German"
             )

# loop to recode labels
for (x in seq(1,length(new_var),1)){
  
  ame_main$factor <- ifelse(ame_main$factor == old_var[x], new_var[x], ame_main$factor)
}

# factor levels set the display order: password strength first, then
# employment status, education, age, sex and citizenship
ame_main$factor <- factor(ame_main$factor, levels = 
                            c("**Password strength (Ref: Control)**", "Short, simple", "Short, complex", "Long, simple", "Long, complex",
                               "**Employment Status (Ref: Employees)**", "Welfare Recipients",
                              "**Educational attainment**<br>**(Ref: No degree)**", "Vocational training", "High School diploma", "High School diploma<br>and vocational training", "Higher education degree", "No information",
                              "**Age (Ref: 18-28)**", "29-38", "39-48", "49-59", "59-64",
                              "**Sex (Ref: Male)**", "Female",
                              "**Citizenship (Ref: Other)**", "German"
                               ))



```


### Figure 1 from the Paper

Figure 1. Average marginal effects (points) and 95% confidence intervals (lines) from logistic regression models over different participation levels (access, complete interview, panel registration). The corresponding average marginal effect table can be found in Supplementary Files, Table S5. 

```{r fig.width=unit(10, "cm"), fig.height=unit(14, "cm"), warning=FALSE}

# Figure 1: one point (AME) and one error bar (lower to upper) per term and
# participation level; the participation levels are dodged within a row and
# told apart by point shape. The reference rows added above have no
# participation level and are dropped from the legend (na.translate = FALSE).
ggplot(ame_main, aes(x = as.numeric(AME), y = factor, group = var, shape = var)) +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0.2, position = position_dodge(width = -0.5), 
                color = "grey") +
  geom_point(position = position_dodge(width = -0.5)) +
  # dashed reference line at an effect of zero
  geom_vline(aes(xintercept=0), linetype= "dashed", color = "grey") +
  # reverse the axis so that the first factor level is drawn at the top
  scale_y_discrete(limits=rev) +
  scale_shape_manual(values = c(15,16,23),
                     name = "Participation level",
                     na.translate = FALSE) +
  theme_bw() +
  #theme(axis.text = element_text(size=15,family="Comic Sans MS"))
  ylab("") +
  xlab("Average Marginal Effects (AME) with 95% Confidence Interval") +
  # element_markdown renders the ** and <br> markup of the row labels
  theme(axis.text.y = ggtext::element_markdown(),
        panel.border = element_blank(),
        axis.ticks = element_blank())


# save the plot drawn above (16 cm x 18 cm)
ggsave(paste0(here(), "/07_IMG/", "Haas 24-0199.R2 Figure 1" , ".png"),
         width = 16, height = 18, units="cm")

# ggsave(paste0(here(), "/07_IMG/", "Haas 24-0199.R2 Figure 1" , ".png"),
#          width = 10, height = 11.25, units="cm")


```

### Table S5 for Supplementary Material

Table S5. Average marginal effects, standard errors (s.e.), and p-values from logistic regression models over different participation levels (access, complete interview, panel registration) – corresponding table to Figure 1 in the main text.

AME (standard error), p-value

```{r}

# replaces missing values by an empty string (for printing)
func <- function(x) {
  ifelse(is.na(x), "", x)
}

# Table S5: one row per term, one column per participation level; each cell
# reads "AME (SE) , p" with three decimals, and p-values below 0.001 are shown
# as "<0.001". The reference rows have no participation level; the "<NA>"
# column they create is dropped and their cells are left empty.
ame_main %>%
  #mutate_all(ifelse(is.na(.x), "-", .x)) %>%
  mutate(stat = paste0(round(AME, 3), " (", round(SE, 3), ") , ",   ifelse(p < 0.001, "<0.001", round(p,3))),
         stat = ifelse(is.na(stat), "", stat)) %>%
  dplyr::select(factor, stat, var) %>%
  spread(key = var, value = stat) %>%
  dplyr::select(-"<NA>") %>%
  mutate_at(vars(-factor), func) %>%
  MY_KABLE() 
  #kable()
         
```

## Logistic regression: effect within subgroups

```{r eval=FALSE, include=FALSE}

# right-hand side with interactions: the password group is interacted with
# employment status, education, sex, citizenship, regd and age group;
# Anbieter and druckerei enter as main effects only
iv_inter <- "factor(grp)*(factor(erwerb) + factor(ausbild) + factor(sex)  + factor(staat) + factor(regd) + factor(agegr)) + factor(Anbieter) + factor(druckerei)"


# one logistic regression per participation level, fitted on the full data set
md_ame_teil <- glm(paste0("teil ~  ", iv_inter), 
                   data = response, family = binomial)

md_ame_teil_voll <- glm(paste0("teil_voll ~", iv_inter), 
                               data = response, family = binomial)


md_ame_reg <- glm(paste0("reg ~ ", iv_inter),
                  data = response, family = binomial)



# Average marginal effects of the password group (grp) within subgroups.
#   md_ame:  fitted model that is handed to margins()
#   PARTLVL: label of the participation level, stored in the column `partlvl`
# For each of employment status, education, sex, citizenship and age group the
# effects of grp are computed at every listed value of that variable. The
# subgroup is named in the column `var`; all pieces are stacked into one data
# frame.
my_margin <- function(md_ame, PARTLVL) {

  # effects of grp with one covariate held at the values given in `at_values`
  grp_ame_at <- function(at_values) {
    summary(margins(md_ame, variables = "grp", at = at_values)) %>%
      as.data.frame()
  }

  mar_erwerb <- grp_ame_at(list(erwerb = 1:2)) %>%
    mutate(var = ifelse(erwerb == 1, "Employees", "Welfare Recipients")) %>%
    dplyr::select(-erwerb)

  mar_ausbild <- grp_ame_at(list(ausbild = 1:6)) %>%
    mutate(var = ifelse(ausbild == 1, "No degree",
                 ifelse(ausbild == 2, "Vocational training",
                 ifelse(ausbild == 3, "High School diploma",
                 ifelse(ausbild == 4, "High School diploma\nand vocational training",
                 ifelse(ausbild == 5, "Higher education degree",
                        "No information")))))) %>%
    dplyr::select(-ausbild)

  mar_sex <- grp_ame_at(list(sex = 0:1)) %>%
    mutate(var = ifelse(sex == 0, "Male", "Female")) %>%
    dplyr::select(-sex)

  mar_staat <- grp_ame_at(list(staat = 0:1)) %>%
    mutate(var = ifelse(staat == 0, "Other", "German")) %>%
    dplyr::select(-staat)

  mar_agegr <- grp_ame_at(list(agegr = 1:5)) %>%
    mutate(var = case_when(
      agegr == "1" ~ "18-28",
      agegr == "2" ~ "29-38",
      agegr == "3" ~ "39-48",
      agegr == "4" ~ "49-59",
      agegr == "5" ~ "59-64")) %>%
    dplyr::select(-agegr)

  # stack all subgroup results and tag them with the participation level
  rbind(mar_erwerb, mar_ausbild, mar_sex, mar_staat, mar_agegr) %>%
    mutate(partlvl = PARTLVL)

}


# subgroup effects for the three participation levels
ame_teil <- my_margin(md_ame_teil, "Access")
ame_teil_voll <- my_margin(md_ame_teil_voll, "Complete Interview")
ame_reg <- my_margin(md_ame_reg, "Panel Registration")

#save results in a table and load it later. Makes code faster.
saveRDS(ame_teil, paste0(here(), "/05_DATA/ame_teil.rds"))
saveRDS(ame_teil_voll, paste0(here(), "/05_DATA/ame_teil_voll.rds"))
saveRDS(ame_reg, paste0(here(), "/05_DATA/ame_reg.rds"))

```

```{r eval=FALSE, include=FALSE}

# bold header labels, one per subgroup variable
lbl_vars <- c("**Employment Status**", "**Sex**", "**Educational attainment**", "**Citizenship**", "**Age**")


# stack the three participation levels; the full joins add one row per header
# label to each of them (joined on the columns the two tables share)
ame_sub <- rbind(
  full_join(ame_teil, data.frame(var = lbl_vars, partlvl = "Access")),
  full_join(ame_teil_voll, data.frame(var = lbl_vars, partlvl = "Complete Interview")),
  full_join(ame_reg, data.frame(var = lbl_vars, partlvl = "Panel Registration")))
 
# var2: the subgroup label as a factor whose levels fix the display order
# (header label first, then its categories)
ame_sub$var2 <- factor(ame_sub$var, levels = c("**Employment Status**", "Employees", "Welfare Recipients",
                                              "**Educational attainment**", "No degree", "Vocational training", "High School diploma", "High School diploma\nand vocational training", "Higher education degree", "No information",
                                              "**Age**", "18-28", "29-38", "39-48", "49-59", "59-64",
                                               "**Sex**", "Male", "Female",
                                       "**Citizenship**", "Other", "German"))

saveRDS(ame_sub, paste0(here(), "/05_DATA/ame_sub.rds"))

#save results in a table and load it later. Makes code faster.
# (csv copy with significance stars and AME / SE rounded to three decimals)
ame_sub %>%
  mutate(stars = p_star(p),
         AME = round(AME, 3),
         SE = round(SE, 3)) %>%
  write.csv2(paste0(here(), "/05_DATA/ame_sub.csv"))

```

### Figures 2 and 3: Average Marginal Effects of password designs within subgroups {.tabset}

#### Figure 2 from Paper {.tabset}

Figure 2. Average marginal effects (points) of password design within subgroups by level of educational attainment and 95 percent confidence intervals (lines) from logistic regression models over different participation levels (i.e., access, complete interview, panel registration). The corresponding average marginal effect table can be found in Supplementary Material, Table S6.

```{r}

# Stored subgroup AMEs, including the axis ordering in `var2`.
ame <- readRDS(paste0(here(), "/05_DATA/ame_sub.rds"))

# Figure 2 is restricted to the educational-attainment subgroups.
ame_edu <- ame %>%
  filter(var %in% c("No degree", "Vocational training", "High School diploma", "High School diploma\nand vocational training", "Higher education degree", "No information"))

# Error bars and points use the same dodge so that they line up per group.
fig2 <- ggplot(
  ame_edu,
  aes(x = as.numeric(AME), y = var2, group = factor(factor), shape = factor(factor))
) +
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    width = 0.1,
    position = position_dodge(width = -0.7),
    color = "snow3"
  ) +
  geom_point(position = position_dodge(width = -0.7), size = 3) +
  geom_vline(aes(xintercept = 0), linetype = "dashed", color = "snow4") +
  scale_y_discrete(limits = rev) +
  scale_shape_manual(
    name = "",
    values = c(1, 16, 5, 18),
    labels = c("Short, simple", "Short, complex", "Long, simple", "Long, complex"),
    na.translate = FALSE
  ) +
  facet_wrap(vars(partlvl), ncol = 3) +
  labs(x = "Average Marginal Effects (AME) with 95% Confidence Interval", y = "") +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "bottom"
  )

# Show the figure in the rendered document ...
fig2

# ... and export the same plot object as PNG.
ggsave(paste0(here(), "/07_IMG/", "Haas 24-0199.R2 Figure 2", ".png"),
       plot = fig2, width = 16, height = 18, units = "cm")


```

#### Figure 3 from paper

Figure 3. Average marginal effects (points) of password designs within subgroups (employment status, age, sex and citizenship) and 95% confidence intervals (lines) of logistic regression models over different participation levels (access, complete interview, panel registration). The corresponding average marginal effect table can be found in Supplementary Material, Table S6.

```{r fig.width=unit(10, "cm"), fig.height=unit(14, "cm")}

# Everything that is not shown in Figure 2: drop the education rows and the
# education heading.
ame_sub <- ame %>%
  anti_join(ame_edu) %>%
  filter(var != "**Educational attainment**")

# Error bars and points use the same dodge so that they line up per group.
fig3 <- ggplot(
  ame_sub,
  aes(x = as.numeric(AME), y = var2, group = factor(factor), shape = factor(factor))
) +
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    width = 0.1,
    position = position_dodge(width = -0.7),
    color = "grey"
  ) +
  geom_point(position = position_dodge(width = -0.7)) +
  geom_vline(aes(xintercept = 0), linetype = "dashed", color = "grey") +
  scale_y_discrete(limits = rev) +
  scale_shape_manual(
    name = "",
    values = c(1, 16, 5, 18),
    labels = c("Short, simple", "Short, complex", "Long, simple", "Long, complex"),
    na.translate = FALSE
  ) +
  scale_x_continuous(breaks = c(-0.02, 0.0, 0.02, 0.04)) +
  theme_bw() +
  facet_wrap(vars(partlvl), ncol = 3) +
  labs(x = "Average Marginal Effects (AME) with 95% Confidence Interval", y = "") +
  theme(
    # the bold subgroup headings on the axis are written in markdown
    axis.text.y = ggtext::element_markdown(),
    panel.border = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "right",
    axis.text.x = element_text(size = 6)
  )

# Show the figure in the rendered document ...
fig3

# ... and export the same plot object as PNG.
ggsave(paste0(here(), "/07_IMG/", "Haas 24-0199.R2 Figure 3", ".png"),
       plot = fig3, width = 16, height = 18, units = "cm")

```

### Table S6 for the Supplementary Material

Table S6. Average marginal effects of password designs within subgroups (employment status, educational attainment, age, sex and citizenship) from logistic regression models over different participation levels (access, complete interview, panel registration) – corresponding table to Figures 2 and 3 in the main text.

```{r}

# Blanks out table cells that hold no estimate.
#
# x: vector of table cells.
# Returns x with every cell equal to "NA (NA) , NA" (the string that the
# "AME (SE) , p" template produces when all three values are missing)
# replaced by ""; all other cells, including NA, are returned unchanged.
func2 <- function(x) {

  has_no_estimate <- x == "NA (NA) , NA"

  ifelse(has_no_estimate, "", x)
}


# Row order of the table: every bold heading is followed by its categories.
tab_row_order <- c(
  "**Employment Status**", "Employees", "Welfare Recipients",
  "**Educational attainment**", "No degree", "Vocational training", "High School diploma", "High School diploma\nand vocational training", "Higher education degree", "No information",
  "**Age**", "18-28", "29-38", "39-48", "49-59", "59-64",
  "**Sex**", "Male", "Female",
  "**Citizenship**", "Other", "German")

# One "AME (SE) , p" cell per estimate; p-values below 0.001 are shown as
# "<0.001". Rows without an estimate end up as "NA (NA) , NA" here and are
# blanked by func2() further down.
ame_cells <- ame %>%
  mutate(
    stat = paste0(
      round(AME, 3), " (", round(SE, 3), ") , ",
      ifelse(p < 0.001, "<0.001", round(p, 3))
    ),
    stat = ifelse(is.na(stat), "", stat)
  ) %>%
  dplyr::select(var, factor, stat, partlvl)

# Wide layout: one column per participation level, then clean-up and labels.
ame_sub_tab <- ame_cells %>%
  spread(key = partlvl, value = stat) %>%
  mutate_at(vars(-var), func) %>%
  mutate_at(vars(-factor), func2) %>%
  mutate(
    var = factor(var, levels = tab_row_order),
    # margins names of the experimental groups -> display labels
    factor = ifelse(
      factor == "grp1", "Short, simple",
      ifelse(
        factor == "grp2", "Short, complex",
        ifelse(
          factor == "grp3", "Long, simple",
          ifelse(factor == "grp4", "Long, complex", "")
        )
      )
    )
  ) %>%
  arrange(var)

# Table S6 in the rendered document
ame_sub_tab %>%
  MY_KABLE()

# ... and as CSV export
write.csv2(ame_sub_tab, paste0(here(), "/05_DATA/ame_sub_tab.csv"))

```

## Data Linkage Consent {.tabset}


#### Table 4 for Paper

Table 4. Data linkage consent rates by experimental group for the sample of employees and welfare recipients.

```{r lc completes ieb}


# Linkage consent rates among completed interviews with a non-missing consent
# answer, computed separately for the employee sample (response_ieb) ...
lc_rates_ieb <- response_ieb %>%
  filter(teil_voll == 1, !is.na(lc)) %>%
  rate_edit("lc") %>%
  rate_des(KABLE = FALSE)

# ... and for the second sample (response_lhg).
lc_rates_lhg <- response_lhg %>%
  filter(teil_voll == 1, !is.na(lc)) %>%
  rate_edit("lc") %>%
  rate_des(KABLE = FALSE)

# Test results shown in the table footnotes (completed interviews only).
lc_chi_ieb <- chi(filter(response_ieb, teil_voll == 1), "lc")
lc_chi_lhg <- chi(filter(response_lhg, teil_voll == 1), "lc")

# Both samples side by side, one row per password group.
left_join(lc_rates_ieb, lc_rates_lhg,
          by = "password",
          suffix = c(".ieb", ".lhg")) %>%
  scramble_pw() %>%
  MY_KABLE() %>%
  add_footnote(paste0("Employee sample: ", lc_chi_ieb), notation = "none", escape = FALSE) %>%
  add_footnote(paste0("Unemployed sample: ", lc_chi_lhg), notation = "none", escape = FALSE)


```

```{r}

# Linkage consent test restricted to completed interviews of the lhg sample in
# the control (grp 0) and short, simple (grp 1) groups. The result is handed to
# save_stat(); presumably the inline `write_stat()` call in the text below
# prints it - save_stat()/write_stat() are defined elsewhere, confirm there.
save_stat(chi(filter(response_lhg, teil_voll == 1 & grp %in% c(0,1)), "lc" ))



```

**Test** linkage consent difference between control and short, simple for lhg sample: `r write_stat()`

### Table S7 for Supplementary Material (logistic regression)

Table S7. Average marginal effects, standard errors (s.e.) and p-values from logistic regression models of data linkage consent for complete interviews of the Employee sample (N = 12,166) and for complete interviews of the Welfare Recipient sample (N = 5,740).

```{r eval=FALSE, include=FALSE}

# Accumulates the AME tables of all fitted linkage-consent models; one block
# of rows is appended per data set and participation level.
ame_lc_full <- data.frame()

# Data sets to loop over and the names under which they are recorded in the
# `data` column of the result (same order in both objects).
  Y <- list(response, response_ieb, response_lhg)
  Ylab <- c("response", "response_ieb", "response_lhg")

# Outer loop: data set (index into Y / Ylab).
# Inner loop: participation indicator; each model uses only the rows where
# that indicator equals 1.
# NOTE(review): the models are fitted at top level inside plain loops.
# margins() may re-evaluate the `data` argument stored in the model call, so
# moving the fits into a function could break the lookup of `Y`, `y` and `x`
# - confirm before refactoring.
for(y in c(1,2,3)) {
  
  for(x in c("teil", "teil_voll", "reg")) {
  
    # Only the model on `response` (y == 1) includes employment status
    # (erwerb); the models on response_ieb and response_lhg omit it.
    if(y == 1){
    md_ame_lc <- glm("lc ~ factor(grp) + factor(erwerb) + factor(ausbild) + factor(sex)  + factor(staat) + factor(regd) + factor(agegr) + factor(exprlc) + factor(druckerei)", data = filter(Y[[y]], .data[[x]] ==1), family = binomial)
    
        ame_lc<- summary(margins(md_ame_lc, variables= c("grp", "erwerb", "ausbild",  "sex", "staat", "agegr")))
    
    }else{
      md_ame_lc <- glm("lc ~ factor(grp) + factor(ausbild) + factor(sex)  + factor(staat) + factor(regd) + factor(agegr) + factor(exprlc) + factor(druckerei)", data = filter(Y[[y]], .data[[x]] ==1), family = binomial)
      
          ame_lc<- summary(margins(md_ame_lc, variables= c("grp", "ausbild",  "sex", "staat", "agegr")))
      
    }
      
  
    # Tag the rows with participation level and data set before appending.
    ame_lc$part_level <- x 
    ame_lc$data <- Ylab[y]
    
    ame_lc_full <- rbind(ame_lc_full, ame_lc)
    
  }  
  
}
  
# Stored result; the Table S7 chunk reads this file instead of refitting.
saveRDS(ame_lc_full, paste0(here(), "/05_DATA/ame_lc.rds"))

# Rounded copy with significance stars as a CSV export.
ame_lc_full %>%
  mutate(stars = p_star(p),
         AME = round(AME, 3),
         SE = round(SE, 3)) %>%
  write.csv2(paste0(here(), "/05_DATA/ame_lc.csv"))

MY_KABLE(ame_lc_full)


# Ad-hoc checks: refit the complete-interview model on response_ieb, list the
# components of its summary, and count the completed interviews in
# response_lhg. NOTE(review): presumably used to look up the N values quoted
# in the Table S7 caption - confirm.
 md_ame_lc <- glm("lc ~ factor(grp) + factor(ausbild) + factor(sex)  + factor(staat) + factor(regd) + factor(agegr) + factor(exprlc) + factor(druckerei)", data = filter(response_ieb, teil_voll ==1), family = binomial)
    
 names(summary(md_ame_lc))
 nrow(filter(response_lhg, teil_voll ==1))

 
```



```{r}

# Stored AMEs of the linkage-consent models (all data sets and participation
# levels).
ame_lc <- readRDS( paste0(here(), "/05_DATA/ame_lc.rds"))


# Reference categories have no AME of their own. One row per reference level
# is appended (NA in all other columns) so that it can be labelled as a
# heading row.
refs <- c("agegr1", "ausbild1", "erwerb1", "grp0", "sex0", "staat0")

ame_lc <- full_join(ame_lc, data.frame(factor = refs))


# Labels are assigned by position: `old_var` holds the level names in the
# sorted order returned by table(), `new_var` lists the display label of each
# level in exactly that order.
old_var <- names(table(ame_lc$factor))
new_var <- c("**Age (Ref: 18-28)**", "29-38", "39-48", "49-59", "59-64",
             "**Educational attainment**<br>**(Ref: No degree)**", "Vocational training", "High School diploma", "High School diploma<br>and vocational training", "Higher education degree", "No information",
             "**Employment Status (Ref: Employees)**", "Welfare Recipients",
             "**Password strength (Ref: Control)**", "Short, simple", "Short, complex", "Long, simple", "Long, complex",
             "**Sex (Ref: Male)**", "Female",
             "**Citizenship (Ref: Other)**", "German"
             )

# Fail loudly if levels and labels get out of step. With a length mismatch the
# positional recode would otherwise attach labels to the wrong levels or turn
# the whole column into NA without any warning.
stopifnot(length(old_var) == length(new_var))

# Recode all labels in a single pass; NA stays NA.
ame_lc$factor <- new_var[match(as.character(ame_lc$factor), old_var)]

# Display order of the rows (password strength first).
ame_lc$factor <- factor(ame_lc$factor, levels = 
                            c("**Password strength (Ref: Control)**", "Short, simple", "Short, complex", "Long, simple", "Long, complex",
                               "**Employment Status (Ref: Employees)**", "Welfare Recipients",
                              "**Educational attainment**<br>**(Ref: No degree)**", "Vocational training", "High School diploma", "High School diploma<br>and vocational training", "Higher education degree", "No information",
                              "**Age (Ref: 18-28)**", "29-38", "39-48", "49-59", "59-64",
                              "**Sex (Ref: Male)**", "Female",
                              "**Citizenship (Ref: Other)**", "German"
                               ))

# the complete table
# ame_lc %>%
#    MY_KABLE()


ame_lc$part_level <- factor(ame_lc$part_level, levels = c("teil", "teil_voll", "reg"))
# Data set names -> sample labels used in the table.
ame_lc$data <- ifelse(ame_lc$data == "response_ieb", "Employees", ifelse(ame_lc$data == "response", "response", "Welfare Recipients"))

# Table S7 only reports the password-design effects among completed
# interviews, formatted as "AME (SE) , p".
ame_lc_tab <- 
  filter(ame_lc, part_level == "teil_voll" & factor %in% c("Short, simple", "Short, complex", "Long, simple", "Long, complex")) %>%
  mutate(stat = paste0(round(AME, 3), " (", round(SE, 3), ") , ",   ifelse(p < 0.001, "<0.001", round(p,3))),
         stat = ifelse(is.na(stat), "", stat)) %>%
  dplyr::select(factor, stat, data)

rows_employees <- filter(ame_lc_tab, data == "Employees")
rows_welfare <- filter(ame_lc_tab, data == "Welfare Recipients")

# The two samples are placed side by side by row position, so both must list
# the same password designs in the same order.
stopifnot(identical(as.character(rows_employees$factor),
                    as.character(rows_welfare$factor)))

cbind(
  dplyr::select(rows_employees, -data),
  dplyr::select(rows_welfare, stat)) %>%
  MY_KABLE()
  
  


```


## Response to income question


```{r}

# Attach experimental group and sample membership from `response` to the
# survey data, matched on the respondent id `opalid`.
w1 <- w1 %>%
  left_join(dplyr::select(response, opalid, grp, sample), by = "opalid")

# Separate data sets for the two samples.
w1_ieb <- w1 %>% filter(sample == "ieb")
w1_lhg <- w1 %>% filter(sample == "lhg")

```


Table 5. Item response rates for gross and net income questions by experimental groups for the respondent samples of employees and welfare recipients.

```{r}
# aa1ze450: gross income
# aa1ze500: net income

# Helpers for income item nonresponse, analogous to rate_edit() and rate_des().

# Adds the item response indicators for the two income questions to DATA:
#   incb  gross income (aa1ze450)
#   incn  net income   (aa1ze500)
# Coding: NA if the answer is coded -88, 1 if the value is >= 0, 0 for any
# other value; a missing answer stays NA.
edit_inc <- function(DATA) {

  # the same coding rule applies to both questions
  code_response <- function(answer) {
    ifelse(answer == -88, NA,
           ifelse(answer >= 0, 1, 0))
  }

  DATA %>%
    mutate(
      incb = code_response(aa1ze450),
      incn = code_response(aa1ze500)
    )
}

# Add the income response indicators (incb, incn) to both samples.
w1_ieb <- w1_ieb %>% edit_inc()
w1_lhg <- w1_lhg %>% edit_inc()

# Item response rate of one income indicator by experimental group, together
# with each group's difference to the control group (grp == 0).
#
# DATA: data frame with the group variable `grp` (codes 0 to 4) and the
#       indicator column named in VAR.
# VAR:  name of the indicator column as a string (e.g. "incb"); NA values are
#       excluded from the rate and from n.
#
# Returns one row per group with the columns
#   lbl_grp        group label
#   n              number of non-missing answers
#   rate, se       response rate and its standard error in percent
#   diff, se_diff  difference to the control group in percentage points and
#                  its standard error
#   p              p-value of prop.test() against the control group, three
#                  decimals without the leading zero
# Stops with an error if DATA contains no control group.
rate_inc <- function(DATA, VAR) {

  # Labels are looked up by group code rather than bound by row position, so
  # a missing or additional group cannot shift them onto the wrong rows.
  grp_codes <- 0:4
  grp_labels <- c("control", "short, simple", "short, complex", "long, simple", "long, complex")

  if (!any(DATA[["grp"]] == 0, na.rm = TRUE)) {
    stop("rate_inc(): DATA contains no control group (grp == 0)", call. = FALSE)
  }

  DATA %>%
    group_by(grp) %>%
    summarize(prop = mean(.data[[VAR]], na.rm = TRUE),
              n = sum(!is.na(.data[[VAR]])),
              se = sqrt(prop * (1 - prop) / n) * 100,
              rr = prop * 100) %>%
    mutate(lbl_grp = grp_labels[match(grp, grp_codes)],
           # control-group values, repeated on every row
           prop_ctrl = prop[match(0, grp)],
           n_ctrl = n[match(0, grp)],
           diff = round(rr - rr[match(0, grp)], 1),
           se_diff = round(sqrt(prop * (1 - prop) / n +
                                  prop_ctrl * (1 - prop_ctrl) / n_ctrl) * 100, 1),
           rate = round(rr, 1),
           se = round(se, 1)) %>%
    # prop.test() is not vectorised, hence one test per row
    rowwise() %>%
    mutate(p = prop.test(c(prop * n, prop_ctrl * n_ctrl), c(n, n_ctrl))$p.value) %>%
    ungroup() %>%
    # ".012" instead of "0.012"
    mutate(p = sub("^0", "", sprintf("%.3f", p))) %>%
    dplyr::select(lbl_grp, n, rate, se, diff, se_diff, p)

}

# Item response table for one sample: the rate_inc() columns for the gross
# income indicator (incb) on the left, those for the net income indicator
# (incn) on the right.
table_inc <- function(DATA) {
  gross_part <- rate_inc(DATA, "incb")
  net_part <- rate_inc(DATA, "incn")

  cbind(gross_part, net_part)
}
 
# Table 5: rows of the ieb sample first, rows of the lhg sample below.
rbind(
  table_inc(w1_ieb),
  table_inc(w1_lhg)
) %>%
  MY_KABLE()

```


```{r echo = TRUE} 
# Tests for the two income response indicators (incb = gross, incn = net),
# printed one after the other in the rendered document.
# NOTE(review): chi() is defined elsewhere; presumably a chi-squared test of
# the indicator across the experimental groups - confirm.

# ieb sample
chi(w1_ieb, "incb")
chi(w1_ieb, "incn")

# lhg sample
chi(w1_lhg, "incb")
chi(w1_lhg, "incn")


```


